In [1]:
# importing dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn import metrics
import numpy as np
from scipy.stats import skew, kurtosis
from scipy import stats
import re
In [2]:
#Data EDA and processing
df = pd.read_csv('Car details v3.csv')
df.head()
Out[2]:
name year selling_price km_driven fuel seller_type transmission owner mileage engine max_power torque seats
0 Maruti Swift Dzire VDI 2014 450000 145500 Diesel Individual Manual First Owner 23.4 kmpl 1248 CC 74 bhp 190Nm@ 2000rpm 5.0
1 Skoda Rapid 1.5 TDI Ambition 2014 370000 120000 Diesel Individual Manual Second Owner 21.14 kmpl 1498 CC 103.52 bhp 250Nm@ 1500-2500rpm 5.0
2 Honda City 2017-2020 EXi 2006 158000 140000 Petrol Individual Manual Third Owner 17.7 kmpl 1497 CC 78 bhp 12.7@ 2,700(kgm@ rpm) 5.0
3 Hyundai i20 Sportz Diesel 2010 225000 127000 Diesel Individual Manual First Owner 23.0 kmpl 1396 CC 90 bhp 22.4 kgm at 1750-2750rpm 5.0
4 Maruti Swift VXI BSIII 2007 130000 120000 Petrol Individual Manual First Owner 16.1 kmpl 1298 CC 88.2 bhp 11.5@ 4,500(kgm@ rpm) 5.0
In [3]:
print("Shape:", df.shape)
print(f"Number of Rows: {df.shape[0]}")
print(f"Number of Columns: {df.shape[1]}")
Shape: (8128, 13)
Number of Rows: 8128
Number of Columns: 13
In [4]:
display(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB
None
In [5]:
df.isnull().sum()
Out[5]:
name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64
In [6]:
df = df.dropna(subset=[
    "engine",
    "max_power",
    "mileage"
])
In [7]:
df["seats"].fillna(df["seats"].mode()[0], inplace=True)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\356958492.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["seats"].fillna(df["seats"].mode()[0], inplace=True)
In [8]:
print(df.isnull().sum())
name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
mileage          0
engine           0
max_power        0
torque           1
seats            0
dtype: int64
In [9]:
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numerical Columns:", numerical_cols)
Numerical Columns: ['year', 'selling_price', 'km_driven', 'seats']
In [10]:
df[numerical_cols].describe().T
Out[10]:
count mean std min 25% 50% 75% max
year 7907.0 2013.982168 3.866650 1994.0 2012.0 2015.0 2017.0 2020.0
selling_price 7907.0 649741.656380 813556.530537 29999.0 270000.0 450000.0 690000.0 10000000.0
km_driven 7907.0 69192.556469 56789.761602 1.0 35000.0 60000.0 95750.0 2360457.0
seats 7907.0 5.416719 0.959588 2.0 5.0 5.0 5.0 14.0
In [11]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
Categorical Columns: ['name', 'fuel', 'seller_type', 'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque']
In [12]:
for col in categorical_cols:
    print(f"\nColumn: {col}")
    print(f"Unique values in '{col}': {df[col].unique()}")
    print(df[col].describe())
Column: name
Unique values in 'name': ['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
count                       7907
unique                      1983
top       Maruti Swift Dzire VDI
freq                         129
Name: name, dtype: object

Column: fuel
Unique values in 'fuel': ['Diesel' 'Petrol' 'LPG' 'CNG']
count       7907
unique         4
top       Diesel
freq        4299
Name: fuel, dtype: object

Column: seller_type
Unique values in 'seller_type': ['Individual' 'Dealer' 'Trustmark Dealer']
count           7907
unique             3
top       Individual
freq            6564
Name: seller_type, dtype: object

Column: transmission
Unique values in 'transmission': ['Manual' 'Automatic']
count       7907
unique         2
top       Manual
freq        6866
Name: transmission, dtype: object

Column: owner
Unique values in 'owner': ['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner'
 'Test Drive Car']
count            7907
unique              5
top       First Owner
freq             5215
Name: owner, dtype: object

Column: mileage
Unique values in 'mileage': ['23.4 kmpl' '21.14 kmpl' '17.7 kmpl' '23.0 kmpl' '16.1 kmpl' '20.14 kmpl'
 '17.3 km/kg' '23.59 kmpl' '20.0 kmpl' '19.01 kmpl' '17.3 kmpl'
 '19.3 kmpl' '18.9 kmpl' '18.15 kmpl' '24.52 kmpl' '19.7 kmpl'
 '22.54 kmpl' '21.0 kmpl' '25.5 kmpl' '26.59 kmpl' '21.5 kmpl' '20.3 kmpl'
 '21.4 kmpl' '24.7 kmpl' '18.2 kmpl' '16.8 kmpl' '24.3 kmpl' '14.0 kmpl'
 '18.6 kmpl' '33.44 km/kg' '23.95 kmpl' '17.0 kmpl' '20.63 kmpl'
 '13.93 kmpl' '16.0 kmpl' '17.8 kmpl' '18.5 kmpl' '12.55 kmpl'
 '12.99 kmpl' '14.8 kmpl' '13.5 kmpl' '26.0 kmpl' '20.65 kmpl' '27.3 kmpl'
 '11.36 kmpl' '17.68 kmpl' '14.28 kmpl' '18.53 kmpl' '14.84 kmpl'
 '21.12 kmpl' '20.36 kmpl' '21.27 kmpl' '18.16 kmpl' '22.0 kmpl'
 '25.1 kmpl' '20.51 kmpl' '21.66 kmpl' '25.2 kmpl' '22.9 kmpl'
 '16.02 kmpl' '20.54 kmpl' '22.77 kmpl' '15.71 kmpl' '23.1 kmpl'
 '19.02 kmpl' '19.81 kmpl' '26.2 km/kg' '16.47 kmpl' '15.04 kmpl'
 '19.1 kmpl' '21.79 kmpl' '18.8 kmpl' '21.21 kmpl' '15.37 kmpl'
 '11.79 kmpl' '19.0 kmpl' '14.3 kmpl' '15.8 kmpl' '15.1 kmpl' '19.09 kmpl'
 '22.32 kmpl' '21.9 kmpl' '14.53 kmpl' '21.63 kmpl' '20.85 kmpl'
 '20.45 kmpl' '19.67 kmpl' '23.01 kmpl' '20.77 kmpl' '17.92 kmpl'
 '17.01 kmpl' '22.37 kmpl' '19.33 kmpl' '9.5 kmpl' '12.83 kmpl'
 '22.48 kmpl' '16.78 kmpl' '14.67 kmpl' '15.0 kmpl' '13.96 kmpl'
 '18.0 kmpl' '12.07 kmpl' '26.21 kmpl' '10.8 kmpl' '16.3 kmpl' '13.6 kmpl'
 '14.74 kmpl' '15.6 kmpl' '19.56 kmpl' '22.69 kmpl' '19.16 kmpl'
 '18.12 kmpl' '12.1 kmpl' '17.5 kmpl' '42.0 kmpl' '20.4 kmpl' '21.1 kmpl'
 '19.44 kmpl' '13.0 kmpl' '21.43 kmpl' '22.95 kmpl' '16.2 kmpl'
 '15.3 kmpl' '28.09 kmpl' '17.4 kmpl' '19.4 kmpl' '26.6 km/kg' '17.6 kmpl'
 '28.4 kmpl' '14.1 kmpl' '25.17 kmpl' '22.74 kmpl' '17.57 kmpl'
 '16.95 kmpl' '19.49 kmpl' '17.21 kmpl' '13.2 kmpl' '14.2 kmpl'
 '26.8 kmpl' '25.4 kmpl' '11.5 kmpl' '27.28 kmpl' '17.97 kmpl' '12.8 kmpl'
 '16.55 kmpl' '12.05 kmpl' '14.07 kmpl' '21.02 kmpl' '11.57 kmpl'
 '17.9 kmpl' '15.96 kmpl' '17.1 kmpl' '17.19 kmpl' '21.01 kmpl'
 '24.0 kmpl' '25.6 kmpl' '21.38 kmpl' '23.84 kmpl' '23.08 kmpl'
 '14.24 kmpl' '20.71 kmpl' '15.64 kmpl' '14.5 kmpl' '16.34 kmpl'
 '27.39 kmpl' '11.1 kmpl' '13.9 kmpl' '20.88 km/kg' '20.92 kmpl'
 '23.8 kmpl' '24.4 kmpl' '15.29 kmpl' '21.19 kmpl' '22.5 kmpl' '19.6 kmpl'
 '23.65 kmpl' '25.32 kmpl' '23.5 kmpl' '16.6 kmpl' '23.9 kmpl' '20.8 kmpl'
 '27.62 kmpl' '12.9 kmpl' '25.44 kmpl' '17.88 kmpl' '22.7 kmpl'
 '17.2 kmpl' '15.42 kmpl' '19.68 kmpl' '18.7 kmpl' '15.4 kmpl'
 '19.34 kmpl' '22.71 kmpl' '25.8 kmpl' '13.7 kmpl' '12.2 kmpl'
 '18.49 kmpl' '9.0 kmpl' '0.0 kmpl' '13.58 kmpl' '10.1 kmpl' '20.5 kmpl'
 '25.0 kmpl' '10.5 kmpl' '22.07 kmpl' '22.3 kmpl' '15.26 kmpl'
 '20.62 kmpl' '27.4 kmpl' '23.2 kmpl' '14.4 kmpl' '18.4 kmpl'
 '30.46 km/kg' '14.02 kmpl' '11.0 kmpl' '20.6 kmpl' '22.05 kmpl'
 '20.2 kmpl' '18.1 kmpl' '22.1 kmpl' '19.87 kmpl' '13.01 kmpl'
 '18.06 kmpl' '26.1 kmpl' '16.52 kmpl' '13.55 kmpl' '24.2 kmpl'
 '25.83 kmpl' '11.2 kmpl' '17.09 kmpl' '21.03 kmpl' '17.45 kmpl'
 '21.64 kmpl' '21.94 km/kg' '13.87 kmpl' '19.98 kmpl' '20.52 kmpl'
 '23.57 kmpl' '11.7 kmpl' '17.43 kmpl' '18.88 kmpl' '13.68 kmpl'
 '11.18 kmpl' '20.89 kmpl' '11.8 kmpl' '19.62 kmpl' '21.7 kmpl'
 '14.9 kmpl' '19.5 kmpl' '10.91 kmpl' '15.7 kmpl' '20.73 kmpl'
 '15.85 kmpl' '20.7 kmpl' '14.23 kmpl' '16.5 kmpl' '17.36 kmpl'
 '12.6 kmpl' '16.36 kmpl' '14.95 kmpl' '16.9 kmpl' '19.2 kmpl'
 '16.96 kmpl' '22.15 kmpl' '18.78 kmpl' '19.61 kmpl' '17.71 kmpl'
 '18.3 kmpl' '19.12 kmpl' '19.72 kmpl' '12.0 kmpl' '11.4 kmpl'
 '23.03 kmpl' '11.07 kmpl' '15.9 kmpl' '17.67 kmpl' '20.46 kmpl'
 '13.1 kmpl' '13.45 km/kg' '24.8 kmpl' '15.73 kmpl' '15.11 kmpl'
 '12.7 kmpl' '21.2 kmpl' '20.38 kmpl' '21.56 kmpl' '13.22 kmpl'
 '14.49 kmpl' '15.05 kmpl' '23.26 kmpl' '15.41 kmpl' '13.8 kmpl'
 '22.27 kmpl' '32.52 km/kg' '14.66 kmpl' '12.12 kmpl' '16.84 kmpl'
 '14.09 kmpl' '14.7 kmpl' '13.4 kmpl' '15.5 kmpl' '13.49 kmpl'
 '11.88 km/kg' '14.6 kmpl' '10.75 kmpl' '24.5 kmpl' '11.74 kmpl'
 '16.07 kmpl' '15.63 kmpl' '26.3 km/kg' '23.7 km/kg' '25.47 kmpl'
 '17.05 kmpl' '23.3 kmpl' '11.9 kmpl' '13.38 kmpl' '20.86 kmpl'
 '19.2 km/kg' '10.9 kmpl' '18.25 kmpl' '15.2 kmpl' '20.37 kmpl'
 '17.8 km/kg' '21.8 kmpl' '11.96 kmpl' '24.04 kmpl' '19.69 kmpl'
 '13.73 kmpl' '21.04 kmpl' '25.01 kmpl' '10.93 kmpl' '10.9 km/kg'
 '24.29 kmpl' '13.44 kmpl' '20.07 kmpl' '21.1 km/kg' '19.08 kmpl'
 '20.34 kmpl' '11.68 kmpl' '12.5 kmpl' '12.3 kmpl' '23.87 kmpl'
 '16.38 kmpl' '17.42 kmpl' '10.0 kmpl' '18.24 kmpl' '10.71 kmpl'
 '19.59 kmpl' '16.7 kmpl' '19.83 kmpl' '21.76 kmpl' '16.05 kmpl'
 '20.28 kmpl' '16.25 kmpl' '16.73 kmpl' '18.48 kmpl' '13.2 km/kg'
 '21.4 km/kg' '14.99 kmpl' '18.76 kmpl' '16.4 kmpl' '19.64 kmpl'
 '14.94 kmpl' '16.6 km/kg' '16.0 km/kg' '17.11 kmpl' '22.8 km/kg'
 '32.26 km/kg' '33.0 km/kg' '12.4 kmpl' '18.44 kmpl' '16.09 kmpl'
 '19.0 km/kg' '12.62 kmpl' '21.13 kmpl' '15.17 kmpl' '21.73 kmpl'
 '21.72 kmpl' '12.85 kmpl' '14.81 kmpl' '13.24 kmpl' '14.4 km/kg'
 '21.49 kmpl' '14.62 kmpl' '26.83 km/kg' '11.45 kmpl' '12.08 kmpl'
 '15.74 kmpl' '11.3 kmpl' '15.1 km/kg' '14.21 kmpl' '11.72 kmpl'
 '16.51 kmpl']
count          7907
unique          393
top       18.9 kmpl
freq            225
Name: mileage, dtype: object

Column: engine
Unique values in 'engine': ['1248 CC' '1498 CC' '1497 CC' '1396 CC' '1298 CC' '1197 CC' '1061 CC'
 '796 CC' '1364 CC' '1399 CC' '1461 CC' '993 CC' '1198 CC' '1199 CC'
 '998 CC' '1591 CC' '2179 CC' '1368 CC' '2982 CC' '2494 CC' '2143 CC'
 '2477 CC' '1462 CC' '2755 CC' '1968 CC' '1798 CC' '1196 CC' '1373 CC'
 '1598 CC' '1998 CC' '1086 CC' '1194 CC' '1172 CC' '1405 CC' '1582 CC'
 '999 CC' '2487 CC' '1999 CC' '3604 CC' '2987 CC' '1995 CC' '1451 CC'
 '1969 CC' '2967 CC' '2497 CC' '1797 CC' '1991 CC' '2362 CC' '1493 CC'
 '1599 CC' '1341 CC' '1794 CC' '799 CC' '1193 CC' '2696 CC' '1495 CC'
 '1186 CC' '1047 CC' '2498 CC' '2956 CC' '2523 CC' '1120 CC' '624 CC'
 '1496 CC' '1984 CC' '2354 CC' '814 CC' '793 CC' '1799 CC' '936 CC'
 '1956 CC' '1997 CC' '1499 CC' '1948 CC' '2997 CC' '2489 CC' '2499 CC'
 '2609 CC' '2953 CC' '1150 CC' '1994 CC' '1388 CC' '1527 CC' '2199 CC'
 '995 CC' '2993 CC' '1586 CC' '1390 CC' '909 CC' '2393 CC' '3198 CC'
 '1339 CC' '2835 CC' '2092 CC' '1595 CC' '2496 CC' '1596 CC' '1597 CC'
 '2596 CC' '2148 CC' '1299 CC' '1590 CC' '2231 CC' '2694 CC' '2200 CC'
 '1795 CC' '1896 CC' '1796 CC' '1422 CC' '1489 CC' '2359 CC' '2197 CC'
 '2999 CC' '1781 CC' '2650 CC' '1343 CC' '2446 CC' '3498 CC' '2198 CC'
 '2776 CC' '1950 CC']
count        7907
unique        121
top       1248 CC
freq         1017
Name: engine, dtype: object

Column: max_power
Unique values in 'max_power': ['74 bhp' '103.52 bhp' '78 bhp' '90 bhp' '88.2 bhp' '81.86 bhp' '57.5 bhp'
 '37 bhp' '67.1 bhp' '68.1 bhp' '108.45 bhp' '60 bhp' '73.9 bhp' '67 bhp'
 '82 bhp' '88.5 bhp' '46.3 bhp' '88.73 bhp' '64.1 bhp' '98.6 bhp'
 '88.8 bhp' '83.81 bhp' '83.1 bhp' '47.3 bhp' '73.8 bhp' '34.2 bhp'
 '35 bhp' '81.83 bhp' '40.3 bhp' '121.3 bhp' '138.03 bhp' '160.77 bhp'
 '117.3 bhp' '116.3 bhp' '83.14 bhp' '67.05 bhp' '168.5 bhp' '100 bhp'
 '120.7 bhp' '98.63 bhp' '175.56 bhp' '103.25 bhp' '171.5 bhp' '100.6 bhp'
 '174.33 bhp' '187.74 bhp' '170 bhp' '78.9 bhp' '88.76 bhp' '86.8 bhp'
 '108.495 bhp' '108.62 bhp' '93.7 bhp' '103.6 bhp' '98.59 bhp' '189 bhp'
 '67.04 bhp' '68.05 bhp' '58.2 bhp' '82.85 bhp' '81.80 bhp' '73 bhp'
 '120 bhp' '94.68 bhp' '160 bhp' '65 bhp' '155 bhp' '69.01 bhp'
 '126.32 bhp' '138.1 bhp' '83.8 bhp' '126.2 bhp' '98.96 bhp' '62.1 bhp'
 '86.7 bhp' '188 bhp' '214.56 bhp' '177 bhp' '280 bhp' '148.31 bhp'
 '254.79 bhp' '190 bhp' '177.46 bhp' '204 bhp' '141 bhp' '117.6 bhp'
 '241.4 bhp' '282 bhp' '150 bhp' '147.5 bhp' '108.5 bhp' '103.5 bhp'
 '183 bhp' '181.04 bhp' '157.7 bhp' '164.7 bhp' '91.1 bhp' '400 bhp'
 '68 bhp' '75 bhp' '85.8 bhp' '87.2 bhp' '53 bhp' '118 bhp' '103.2 bhp'
 '83 bhp' '84 bhp' '58.16 bhp' '147.94 bhp' '74.02 bhp' '53.3 bhp'
 '80 bhp' '88.7 bhp' '97.7 bhp' '121.36 bhp' '162 bhp' '140 bhp' '94 bhp'
 '100.57 bhp' '82.9 bhp' '83.11 bhp' '70 bhp' '153.86 bhp' '121 bhp'
 '126.3 bhp' '73.97 bhp' '171 bhp' '69 bhp' '99.6 bhp' '102 bhp' '105 bhp'
 '63 bhp' '79.4 bhp' '97.9 bhp' '63.1 bhp' '66.1 bhp' '110 bhp'
 '174.5 bhp' '53.26 bhp' '73.75 bhp' '67.06 bhp' '64.08 bhp' '37.5 bhp'
 '189.3 bhp' '158.8 bhp' '61.7 bhp' '55.2 bhp' '71.01 bhp' '73.74 bhp'
 '147.9 bhp' '71 bhp' '77 bhp' '121.4 bhp' '113.4 bhp' '47 bhp' '130 bhp'
 '57.6 bhp' '138 bhp' '52.8 bhp' '53.64 bhp' '53.5 bhp' '76.8 bhp'
 '82.4 bhp' '113.42 bhp' '76 bhp' '84.8 bhp' '56.3 bhp' '218 bhp'
 '112 bhp' '92 bhp' '105.5 bhp' '169 bhp' '95 bhp' '72.4 bhp' '115 bhp'
 '152 bhp' '91.2 bhp' '156 bhp' '74.9 bhp' '62 bhp' '105.3 bhp'
 '73.94 bhp' '85.80 bhp' '85 bhp' '118.3 bhp' '72 bhp' '147.51 bhp'
 '58 bhp' '64 bhp' '126.24 bhp' '76.9 bhp' '194.3 bhp' '99.23 bhp'
 '89.84 bhp' '123.7 bhp' '118.35 bhp' '99 bhp' '241 bhp' '136 bhp'
 '261.4 bhp' '104.68 bhp' '37.48 bhp' '104 bhp' '88.50 bhp' '63.12 bhp'
 '91.7 bhp' '102.5 bhp' '177.6 bhp' '45 bhp' '123.37 bhp' '147.8 bhp'
 '184 bhp' '84.48 bhp' '68.07 bhp' '74.96 bhp' '167.6 bhp' '152.87 bhp'
 '112.2 bhp' '83.83 bhp' '197 bhp' '110.4 bhp' '104.55 bhp' '103 bhp'
 '103.3 bhp' '66 bhp' '108.6 bhp' '165 bhp' '163.7 bhp' '116.9 bhp'
 '94.93 bhp' '127 bhp' '198.5 bhp' '179.5 bhp' '120.69 bhp' '121.31 bhp'
 '138.08 bhp' '187.7 bhp' '80.8 bhp' '86.79 bhp' '93.87 bhp' '116.6 bhp'
 '143 bhp' '92.7 bhp' '88 bhp' '58.33 bhp' '78.8 bhp' '64.4 bhp' '125 bhp'
 '139.01 bhp' '254.8 bhp' '181 bhp' '258 bhp' '55.23 bhp' '270.9 bhp'
 '265 bhp' '157.75 bhp' '101 bhp' '186 bhp' '187.4 bhp' '224 bhp'
 '64.9 bhp' '148 bhp' '35.5 bhp' '89.75 bhp' '32.8 bhp' '91.72 bhp'
 '106 bhp' '98.97 bhp' '66.6 bhp' '86 bhp' '65.3 bhp' '98.82 bhp'
 '198.25 bhp' '38 bhp' '142 bhp' '132 bhp' '174.57 bhp' '178 bhp'
 '163.2 bhp' '203.2 bhp' '177.5 bhp' '175 bhp' '57 bhp' '80.84 bhp'
 '68.4 bhp' '167.67 bhp' '170.63 bhp' '52 bhp' '149.5 bhp' '48.21 bhp'
 ' bhp' '201.1 bhp' '100.5 bhp' '144 bhp' '194.4 bhp' '168.7 bhp'
 '104.5 bhp' '103.26 bhp' '116.4 bhp' '98.79 bhp' '80.9 bhp' '58.3 bhp'
 '272 bhp' '235 bhp' '167.62 bhp' '170.30 bhp' '139.46 bhp' '158 bhp'
 '110.5 bhp' '82.5 bhp' '141.1 bhp' '38.4 bhp' '197.2 bhp' '161 bhp'
 '194 bhp' '122.4 bhp' '134.10 bhp' '60.2 bhp' '134 bhp' '203 bhp'
 '135.1 bhp']
count       7907
unique       321
top       74 bhp
freq         377
Name: max_power, dtype: object

Column: torque
Unique values in 'torque': ['190Nm@ 2000rpm' '250Nm@ 1500-2500rpm' '12.7@ 2,700(kgm@ rpm)'
 '22.4 kgm at 1750-2750rpm' '11.5@ 4,500(kgm@ rpm)' '113.75nm@ 4000rpm'
 '7.8@ 4,500(kgm@ rpm)' '59Nm@ 2500rpm' '170Nm@ 1800-2400rpm'
 '160Nm@ 2000rpm' '248Nm@ 2250rpm' '78Nm@ 4500rpm' '84Nm@ 3500rpm'
 '115Nm@ 3500-3600rpm' '200Nm@ 1750rpm' '62Nm@ 3000rpm'
 '219.7Nm@ 1500-2750rpm' '114Nm@ 3500rpm' '115Nm@ 4000rpm' '69Nm@ 3500rpm'
 '172.5Nm@ 1750rpm' '6.1kgm@ 3000rpm' '114.7Nm@ 4000rpm' '60Nm@ 3500rpm'
 '90Nm@ 3500rpm' '151Nm@ 4850rpm' '104Nm@ 4000rpm' '320Nm@ 1700-2700rpm'
 '250Nm@ 1750-2500rpm' '145Nm@ 4600rpm' '146Nm@ 4800rpm'
 '343Nm@ 1400-3400rpm' '200Nm@ 1400-3400rpm' '200Nm@ 1250-4000rpm'
 '400Nm@ 2000-2500rpm' '138Nm@ 4400rpm' '360Nm@ 1200-3400rpm'
 '200Nm@ 1200-3600rpm' '380Nm@ 1750-2500rpm' '173Nm@ 4000rpm'
 '400Nm@ 1750-3000rpm' '400Nm@ 1400-2800rpm' '200Nm@ 1750-3000rpm'
 '111.7Nm@ 4000rpm' '219.6Nm@ 1500-2750rpm' '112Nm@ 4000rpm'
 '250Nm@ 1500-3000rpm' '130Nm@ 4000rpm' '205Nm@ 1750-3250rpm'
 '280Nm@ 1350-4600rpm' '99.04Nm@ 4500rpm' '77Nm@ 3500rpm' '110Nm@ 3750rpm'
 '153Nm@ 3800rpm' '113.7Nm@ 4000rpm' '114Nm@ 4000rpm' '113Nm@ 4200rpm'
 '101Nm@ 3000rpm' '290Nm@ 1800-2800rpm' '120Nm@ 4250rpm'
 '250Nm@ 1500~4500rpm' '96 Nm at 3000 rpm' '360Nm@ 1750-2800rpm'
 '135Nm@ 2500rpm' '259.8Nm@ 1900-2750rpm' '200Nm@ 1900rpm'
 '259.9Nm@ 1900-2750rpm' '91Nm@ 4250rpm' '96.1Nm@ 3000rpm'
 '109Nm@ 4500rpm' '400nm@ 1750-3000rpm' '202Nm@ 3600-5200rpm'
 '430Nm@ 1750-2500rpm' '347Nm@ 4300rpm' '382nm@ 1750-2250rpm'
 '620Nm@ 1600-2400rpm' '400Nm@ 1750-2500rpm' '250@ 1250-5000rpm'
 '500Nm@ 1600-1800rpm' '250Nm@ 1600-3600rpm' '400Nm' '550Nm@ 1750-2750rpm'
 '490Nm@ 1600rpm' '250 Nm at 2750 rpm' '177.5Nm@ 4700rpm'
 '170Nm@ 1750-4000rpm' '300Nm@ 1200-4000rpm' '300Nm@ 1200-1400rpm'
 '260Nm@ 1500-2750rpm' '213Nm@ 4500rpm' '224Nm@ 4000rpm' '640Nm@ 1740rpm'
 '113Nm@ 4500rpm' '95Nm@ 3000-4300rpm' '13.1kgm@ 4600rpm'
 '205Nm@ 1800-2800rpm' '71Nm@ 3500rpm' '190Nm@ 1750-3000rpm'
 '146Nm at 4800 rpm' '14.9 KGM at 3000 RPM' '115Nm@ 3200rpm'
 '117nm@ 4000rpm' '320Nm@ 1500-3000rpm' '72Nm@ 4386rpm'
 '11.4 kgm at 4,000 rpm' '140Nm@ 1500-4000rpm' '134Nm@ 4000rpm'
 '150Nm@ 4500rpm' '340Nm@ 1800-3250rpm' '240Nm@ 1600-2800rpm'
 '330Nm@ 1600-2800rpm' '12.5@ 3,500(kgm@ rpm)' '110Nm@ 4800rpm'
 '111.8Nm@ 4000rpm' '11.8@ 3,200(kgm@ rpm)' '135.4Nm@ 2500rpm'
 '300Nm@ 1750-2500rpm' '190.25nm@ 1750-2250rpm' '140Nm@ 1800-3000rpm'
 '20.4@ 1400-3400(kgm@ rpm)' '247Nm@ 1800-2000rpm' '223Nm@ 1600-2200rpm'
 '180 Nm at 1440-1500rpm' '195Nm@ 1400-2200rpm' '154.9Nm@ 4200rpm'
 '114.73Nm@ 4000rpm' '160Nm@ 1500-2750rpm' '108Nm@ 4400rpm'
 '190.24nm@ 1750-2250rpm' '200Nm@ 2000-3500rpm' '420Nm@ 1400-2600rpm'
 '100Nm@ 2700rpm' '51Nm@ 4000rpm' '250Nm@ 1250-5300rpm' '132Nm@ 3000rpm'
 '350Nm@ 1500-2750rpm' '218Nm@ 4200rpm' '14.9@ 3,000(kgm@ rpm)'
 '24@ 1,900-2,750(kgm@ rpm)' '13.5@ 2,500(kgm@ rpm)' '85Nm@ 3000rpm'
 '74.5Nm@ 4000rpm' '160Nm@ 1750rpm' '180.4Nm@ 1750-2500rpm'
 '230Nm@ 1500-2500rpm' '113.75Nm@ 4000rpm' '219.66nm@ 1500-2750rpm'
 '245Nm@ 1750rpm' '360Nm@ 1400-3200rpm' '320Nm@ 2000rpm'
 '135 Nm at 2500  rpm ' '24 KGM at 1900-2750 RPM' '190Nm@ 1750-2250rpm'
 '204Nm@ 2000-2750rpm' '14.3@ 1,800-3,000(kgm@ rpm)' '250nm@ 1500-2750rpm'
 '125Nm@ 2000rpm' '172Nm@ 4300rpm' '150Nm@ 1750rpm' '102Nm@ 4000rpm'
 '85Nm@ 2500rpm' '8.5@ 2,500(kgm@ rpm)' '180Nm@ 1440-1500rpm'
 '106.5Nm@ 4400rpm' '108.5Nm@ 5000rpm' '350Nm@ 1750-2500rpm'
 '144.15nm@ 4500rpm' '104Nm@ 4400rpm' '99Nm@ 4500rpm' '200Nm@ 2000rpm'
 '280Nm@ 1800-2800rpm' '142.5Nm@ 1750rpm' '140Nm@ 4400rpm'
 '115@ 2,500(kgm@ rpm)' '196Nm@ 5000rpm' '260 Nm at 1800-2200 rpm'
 '9.8@ 3,000(kgm@ rpm)' '209Nm@ 2000rpm' '135 Nm at 2500 rpm'
 '140Nm@ 4200rpm' '220Nm at 1400-2600 rpm' '48Nm@ 3000rpm'
 '171Nm@ 1800rpm' '277.5Nm@ 1700-2200rpm' '215Nm@ 3600rpm'
 '219.6Nm@ 1750-2750rpm' '195Nm@ 1440-2200rpm' '13@ 2,500(kgm@ rpm)'
 '180Nm@ 2000rpm' '200Nm@ 1400-2200rpm' '380Nm(38.7kgm)@ 2500rpm'
 '110Nm@ 4400rpm' '72Nm@ 4388rpm' '263.7Nm@ 2500rpm' '320Nm@ 1600-2800rpm'
 '25.5@ 1,500-3,000(kgm@ rpm)' '16.3@ 2,000(kgm@ rpm)'
 '190 Nm at 1750 rpm ' '94.14Nm@ 3500rpm' '12@ 3,500(kgm@ rpm)'
 '113Nm@ 5000rpm' '280Nm@ 2400-2800rpm' '96Nm@ 3500rpm'
 '16@ 2,000(kgm@ rpm)' '320Nm@ 1750-3000rpm' '114.73nm@ 4000rpm'
 '320Nm@ 1750-2500rpm' '138nm@ 4400rpm' '190Nm@ 1750rpm' '789Nm@ 2250rpm'
 '259.87Nm@ 1900-2750rpm' '205Nm@ 1750rpm' '436.39Nm@ 1800-2500rpm'
 '182.5Nm@ 1500-1800rpm' '90.3Nm@ 4200rpm' '12.5@ 2,500(kgm@ rpm)'
 '215Nm@ 1750-3000rpm' '215Nm@ 1750-3000' '305Nm@ 2000rpm'
 '540Nm@ 2000rpm' '327Nm@ 2600rpm' '300Nm@ 1600-3000rpm'
 '620Nm@ 2000-2500rpm' '450Nm@ 1600-2400rpm' '19@ 1,800(kgm@ rpm)'
 '9.2@ 4,200(kgm@ rpm)' '145@ 4,100(kgm@ rpm)' '51Nm@ 4000+/-500rpm'
 '110Nm@ 3000rpm' '148Nm@ 3500rpm' '116Nm@ 4750rpm'
 '48@ 3,000+/-500(NM@ rpm)' '148Nm@ 4000rpm' '222Nm@ 4300rpm'
 '135.3Nm@ 5000rpm' '98Nm@ 1600-3000rpm' '170Nm@ 1400-4500rpm'
 '343Nm@ 1400-2800rpm' '402Nm@ 1600-3000rpm' '113Nm@ 3300rpm'
 '99.07Nm@ 4500rpm' '210nm@ 1600-2200rpm' '190 Nm at 1750  rpm '
 '32.1kgm@ 2000rpm' '224nm@ 1500-2750rpm' '400nm@ 1750-2500rpm'
 '215Nm@ 1750-2500rpm' '25@ 1,800-2,800(kgm@ rpm)' '197Nm@ 1750rpm'
 '136.3Nm@ 4200rpm' '470Nm@ 1750-2500rpm' '11@ 3,000(kgm@ rpm)'
 '142Nm@ 4000rpm' '145Nm@ 4100rpm' '320Nm@ 1500-2800rpm'
 '123Nm@ 1000-2500rpm' '218Nm@ 1400-2600rpm' '510@ 1600-2400'
 '220Nm@ 1500-2750rpm' '380Nm@ 2000rpm' '104Nm@ 3100rpm' '292Nm@ 2000rpm'
 '20@ 3,750(kgm@ rpm)' '46.5@ 1,400-2,800(kgm@ rpm)' '380Nm@ 2500rpm'
 '15@ 3,800(kgm@ rpm)' '136Nm@ 4250rpm' '228Nm@ 4400rpm' '149Nm@ 4500rpm'
 '187Nm@ 2500rpm' '146Nm@ 3400rpm' '8.6@ 3,500(kgm@ rpm)'
 '219.7Nm@ 1750-2750rpm' '190Nm@ 2000-3000' '450Nm@ 2000rpm'
 '300Nm@ 2000rpm' '230Nm@ 1800-2000rpm' '42@ 2,000(kgm@ rpm)'
 '110Nm@ 3000-4300rpm' '110(11.2)@ 4800' '330Nm@ 1800rpm'
 '225Nm@ 1500-2500rpm' '380Nm@ 1750-2750rpm' '28.3@ 1,700-2,200(kgm@ rpm)'
 '259.88Nm@ 1900-2750rpm' '580Nm@ 1400-3250rpm' '400 Nm /2000 rpm'
 '127Nm@ 3500rpm' '300Nm@ 1500-2500rpm' '132.3Nm@ 4000rpm'
 '113nm@ 4400rpm' '151NM@ 4850rpm' '153Nm@ 3750-3800rpm'
 '10.7@ 2,500(kgm@ rpm)' '124.6Nm@ 3500rpm' '78Nm@ 3500rpm'
 '219.9Nm@ 1750-2750rpm' '420.7Nm@ 1800-2500rpm' '130Nm@ 3000rpm'
 '424Nm@ 2000rpm' '130@ 2500(kgm@ rpm)' '99.8Nm@ 2700rpm'
 '113Nm@ 4,500rpm' '11.2@ 4,400(kgm@ rpm)' '240Nm@ 1850rpm'
 '16.1@ 4,200(kgm@ rpm)' '320Nm@ 1750-2700rpm' '115Nm@ 4500rpm'
 '245Nm@ 4000rpm' '321Nm@ 1600-2400rpm' '619Nm@ 1600-2400rpm'
 '380Nm@ 1750-3000rpm' '560Nm@ 1500rpm' '230Nm@ 1500-2250rpm'
 '90Nm@ 2650rpm' '260Nm@ 1800-2200rpm' '600Nm@ 2000rpm'
 '259.87nm@ 1500-3000rpm' '16.6@ 4,500(kgm@ rpm)' '219.66NM@ 1500-2750rpm'
 '12.5@ 3,000(kgm@ rpm)' '620Nm@ 1500-2500rpm' '250Nm@ 1500-4500rpm'
 '14.9@ 3,400(kgm@ rpm)' '25.5@ 1,900(kgm@ rpm)' '33.7@ 1,800(kgm@ rpm)'
 '285Nm@ 2400-4000rpm' '10.7@ 2,600(kgm@ rpm)' '250Nm@ 1000-2000rpm'
 '240Nm@ 1750rpm' '226Nm@ 4400rpm' '510Nm@ 1600-2800rpm'
 '259.87NM@ 1500-3000rpm' '155 Nm at 1600-2800 rpm' '240Nm@ 2000rpm'
 '103Nm@ 4500rpm' '13.5@ 4,800(kgm@ rpm)' '400Nm@ 1750-2750rpm'
 '175Nm@ 1500-4100rpm' '72.9Nm@ 2250rpm' '135.4Nm@ 2500' '245Nm@ 5000rpm'
 '57Nm@ 2500rpm' '96Nm@ 2500rpm' '215nm@ 1750-2500rpm'
 '10.4@ 3,200(kgm@ rpm)' '128Nm@ 3100rpm' '102Nm@ 2600rpm'
 '131Nm@ 4400rpm' '11.4@ 4,000(kgm@ rpm)' '250Nm@ 4250rpm'
 '343Nm@ 1600-2800rpm' '185Nm@ 1750-2750rpm' '12@ 2500(kgm@ rpm)'
 '12.4@ 2,600(kgm@ rpm)' '170Nm@ 4200rpm' '176Nm@ 1500rpm'
 '380NM@ 1800-2800rpm' '250Nm@ 1600-2000rpm' '24.5@ 3,500-4,500(kgm@ rpm)'
 '22.9@ 1,950-4,700(kgm@ rpm)' '113Nm@ 4400rpm' '121Nm@ 2800rpm'
 '210 / 1900' '250Nm@ 1250-5000rpm' '400Nm@ 175-2750rpm'
 '350Nm@ 1500-3500rpm' '175nm@ 1750-4000rpm' '115@ 2500(kgm@ rpm)'
 '110Nm@ 4500rpm' '190Nm@ 2000-3000rpm' '106Nm@ 2200rpm'
 '21.4@ 1,750-4,600(kgm@ rpm)' '96Nm@ 3000rpm' '23.6@ 4,250(kgm@ rpm)'
 '11.3kgm@ 4700rpm' '450Nm@ 1750-2500rpm' '35.7@ 1,750-3,000(kgm@ rpm)'
 '6@ 2,500(kgm@ rpm)' nan '13.9 kgm at 4200 rpm' '320Nm@ 1400-4100rpm'
 '150Nm@ 1700-4500rpm' '113.8Nm@ 4000rpm' '110@ 3,000(kgm@ rpm)'
 '151Nm@ 2400rpm' '62Nm@ 2500rpm' '18@ 1,600-2,200(kgm@ rpm)'
 '83Nm@ 3000rpm' '124.5Nm@ 3500rpm' '20@ 4,700(kgm@ rpm)'
 '300Nm@ 1600-4000rpm' '171.6Nm@ 1500-4000rpm' '21.4@ 1,900(kgm@ rpm)'
 '190@ 21,800(kgm@ rpm)' '5.7@ 2,500(kgm@ rpm)' '88.4Nm@ 4200rpm'
 '250 Nm at 1,500-3,000 rpm' '340nm@ 1750-3000rpm'
 '36.6@ 1,750-2,500(kgm@ rpm)' '12.5kgm@ 3500rpm' '6.1@ 3,000(kgm@ rpm)'
 '110Nm@ 4000rpm' '350nm@ 1800-2600rpm' '175nm@ 1500-4100rpm'
 '4.8kgm@ 3000rpm' '355Nm@ 4500rpm' '51@ 1,750-3,000(kgm@ rpm)'
 '119Nm@ 4250rpm' '410Nm@ 1600-2800rpm' '174Nm@ 4300rpm' '99.1Nm@ 4500rpm'
 '385Nm@ 1600-2500rpm' '180 Nm at 2000rpm' '190 Nm at 1750 rpm'
 '53@ 2,000-2,750(kgm@ rpm)' '360Nm@ 1400-2600rpm' '420Nm@ 2000rpm'
 '124Nm@ 3500rpm' '17.5@ 4,300(kgm@ rpm)' '360Nm@ 2000rpm'
 '145Nm@ 3750rpm' '85Nm@ 3500rpm' '190Nm@ 4200rpm' '190 Nm at 2000rpm'
 '13.5@ 2500(kgm@ rpm)' '250nm@ 1500-3000rpm' '159.8Nm@ 1500-2750rpm'
 '500Nm@ 2000rpm' '333Nm@ 1600-3200rpm' '400nm@ 2800rpm'
 '33@ 2,000-2,680(kgm@ rpm)' '10.2@ 2,600(kgm@ rpm)' '480Nm'
 '190Nm@ 4300rpm' '320Nm@ 1800-2800rpm' '380Nm@ 1750rpm'
 '250.06nm@ 1500-2750rpm' '90nm@ 3500rpm' '190Nm@ 3700rpm'
 '436.4Nm@ 1800-2500rpm' '96  Nm at 3000  rpm ']
count               7906
unique               441
top       190Nm@ 2000rpm
freq                 530
Name: torque, dtype: object
In [13]:
def to_float_num(x):
    if pd.isna(x):
        return np.nan
    s = str(x)
    m = re.search(r"(\d+(\.\d+)?)", s)
    return float(m.group(1)) if m else np.nan

def parse_torque(x):

    if pd.isna(x):
        return (np.nan, np.nan)
    s = str(x).lower().replace(" ", "")
    # torque number
    torque_nm = None
    m_nm = re.search(r"(\d+(\.\d+)?)\s*nm", s)
    if m_nm:
        torque_nm = float(m_nm.group(1))
    else:
        # sometimes no 'nm' unit but still has number
        m_any = re.search(r"(\d+(\.\d+)?)", s)
        torque_nm = float(m_any.group(1)) if m_any else np.nan

    # rpm after @
    torque_rpm = np.nan
    m_rpm = re.search(r"@(\d+(\.\d+)?)", s)
    if m_rpm:
        torque_rpm = float(m_rpm.group(1))
    else:
        m_rpm2 = re.search(r"(\d+(\.\d+)?)rpm", s)
        if m_rpm2:
            torque_rpm = float(m_rpm2.group(1))

    return torque_nm, torque_rpm

def clean_car_df(df):
    df = df.copy()

    # Strip column names
    df.columns = [c.strip() for c in df.columns]

    # Standardize some object columns
    for col in ["name", "fuel", "seller_type", "transmission", "owner"]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip()
            df[col] = df[col].replace({"nan": np.nan, "None": np.nan, "": np.nan})

    # Convert year, seats to numeric
    for col in ["year", "seats"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # selling_price, km_driven to numeric
    for col in ["selling_price", "km_driven"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # mileage: e.g., "17.0 kmpl", "23.4 km/kg"
    if "mileage" in df.columns:
        df["mileage_num"] = df["mileage"].apply(to_float_num)

    # engine: e.g., "1248 CC"
    if "engine" in df.columns:
        df["engine_cc"] = df["engine"].apply(to_float_num)

    # max_power: e.g., "74 bhp"
    if "max_power" in df.columns:
        df["max_power_bhp"] = df["max_power"].apply(to_float_num)

    # torque: create torque_nm and torque_rpm
    if "torque" in df.columns:
        torque_parsed = df["torque"].apply(parse_torque)
        df["torque_nm"] = torque_parsed.apply(lambda t: t[0])
        df["torque_rpm"] = torque_parsed.apply(lambda t: t[1])

    return df

df = clean_car_df(df)
display(df.head())
name year selling_price km_driven fuel seller_type transmission owner mileage engine max_power torque seats mileage_num engine_cc max_power_bhp torque_nm torque_rpm
0 Maruti Swift Dzire VDI 2014 450000 145500 Diesel Individual Manual First Owner 23.4 kmpl 1248 CC 74 bhp 190Nm@ 2000rpm 5.0 23.40 1248.0 74.00 190.0 2000.0
1 Skoda Rapid 1.5 TDI Ambition 2014 370000 120000 Diesel Individual Manual Second Owner 21.14 kmpl 1498 CC 103.52 bhp 250Nm@ 1500-2500rpm 5.0 21.14 1498.0 103.52 250.0 1500.0
2 Honda City 2017-2020 EXi 2006 158000 140000 Petrol Individual Manual Third Owner 17.7 kmpl 1497 CC 78 bhp 12.7@ 2,700(kgm@ rpm) 5.0 17.70 1497.0 78.00 12.7 2.0
3 Hyundai i20 Sportz Diesel 2010 225000 127000 Diesel Individual Manual First Owner 23.0 kmpl 1396 CC 90 bhp 22.4 kgm at 1750-2750rpm 5.0 23.00 1396.0 90.00 22.4 2750.0
4 Maruti Swift VXI BSIII 2007 130000 120000 Petrol Individual Manual First Owner 16.1 kmpl 1298 CC 88.2 bhp 11.5@ 4,500(kgm@ rpm) 5.0 16.10 1298.0 88.20 11.5 4.0
In [14]:
df["torque_nm"].fillna(df["torque_nm"].median(), inplace=True)
df["torque_rpm"].fillna(df["torque_rpm"].median(), inplace=True)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\3206175945.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["torque_nm"].fillna(df["torque_nm"].median(), inplace=True)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\3206175945.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["torque_rpm"].fillna(df["torque_rpm"].median(), inplace=True)
In [15]:
df = df.copy()
df.drop(columns=["mileage", "engine", "max_power", "torque"], inplace=True, errors="ignore")
In [16]:
def missing_values_analysis(df):
    
    missing = pd.DataFrame({
        'Column': df.columns,
        'Missing_Count': df.isnull().sum(),
        'Missing_Percentage': (df.isnull().sum() / len(df)) * 100,
        'Data_Type': df.dtypes
    })
    missing = missing[missing['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)

    print("Missing Values Summary:")
    if len(missing) > 0:
        print(missing)
    else:
        print("No missing values found")
    
    print(f"\nTotal missing values: {df.isnull().sum().sum()}")
    print(f"Percentage of missing data: {(df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100:.2f}%")
In [17]:
missing_values_analysis(df)
Missing Values Summary:
                      Column  Missing_Count  Missing_Percentage Data_Type
max_power_bhp  max_power_bhp              1            0.012647   float64

Total missing values: 1
Percentage of missing data: 0.00%
In [18]:
def duplicate_analysis(df):
    duplicates = df.duplicated().sum()
    print(f"\nNumber of duplicate rows: {duplicates}")
    print(f"Percentage of duplicates: {(duplicates / len(df)) * 100:.2f}%")
    
    if duplicates > 0:
        print("\nDuplicate Rows:")
        print(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))
In [19]:
duplicate_analysis(df)
Number of duplicate rows: 1189
Percentage of duplicates: 15.04%

Duplicate Rows:
                                      name  year  selling_price  km_driven  \
1977  Audi Q3 2.0 TDI Quattro Premium Plus  2017        2825000      22000   
7324  Audi Q3 2.0 TDI Quattro Premium Plus  2017        2825000      22000   
2129               Audi Q5 3.0 TDI Quattro  2014        1850000      76131   
7775               Audi Q5 3.0 TDI Quattro  2014        1850000      76131   
131             Audi Q5 35TDI Premium Plus  2018        3975000      31800   
...                                    ...   ...            ...        ...   
6693        Volvo XC40 D4 Inscription BSIV  2019        3800000      20000   
6963        Volvo XC40 D4 Inscription BSIV  2019        3800000      20000   
7862        Volvo XC40 D4 Inscription BSIV  2019        3800000      20000   
145                 Volvo XC40 D4 R-Design  2018        3400000      22000   
3251                Volvo XC40 D4 R-Design  2018        3400000      22000   

        fuel seller_type transmission        owner  seats  mileage_num  \
1977  Diesel      Dealer    Automatic  First Owner    5.0        15.73   
7324  Diesel      Dealer    Automatic  First Owner    5.0        15.73   
2129  Diesel  Individual    Automatic  First Owner    5.0        13.22   
7775  Diesel  Individual    Automatic  First Owner    5.0        13.22   
131   Diesel      Dealer    Automatic  First Owner    5.0        17.01   
...      ...         ...          ...          ...    ...          ...   
6693  Diesel  Individual    Automatic  First Owner    5.0        18.00   
6963  Diesel  Individual    Automatic  First Owner    5.0        18.00   
7862  Diesel  Individual    Automatic  First Owner    5.0        18.00   
145   Diesel      Dealer    Automatic  First Owner    5.0        18.00   
3251  Diesel      Dealer    Automatic  First Owner    5.0        18.00   

      engine_cc  max_power_bhp  torque_nm  torque_rpm  
1977     1968.0         174.33      380.0      1750.0  
7324     1968.0         174.33      380.0      1750.0  
2129     2967.0         241.40      580.0      1400.0  
7775     2967.0         241.40      580.0      1400.0  
131      1968.0         188.00      400.0      1750.0  
...         ...            ...        ...         ...  
6693     1969.0         190.00      400.0      2000.0  
6963     1969.0         190.00      400.0      2000.0  
7862     1969.0         190.00      400.0      2000.0  
145      1969.0         190.00      400.0      2000.0  
3251     1969.0         190.00      400.0      2000.0  

[1801 rows x 14 columns]
In [20]:
def unique_values_analysis(df):
    
    unique_info = pd.DataFrame({
        'Column': df.columns,
        'Unique_Count': [df[col].nunique() for col in df.columns],
        'Unique_Percentage': [(df[col].nunique() / len(df)) * 100 for col in df.columns],
        'Sample_Values': [df[col].unique()[:5].tolist() for col in df.columns]
    })
    
    print("\n", unique_info)
    
    # Detailed view for categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    print("\n" + "-"*80)
    print("Categorical Columns - Value Counts:")
    for col in categorical_cols:
        print(f"\n{col}:")
        print(df[col].value_counts())
In [21]:
unique_values_analysis(df)
            Column  Unique_Count  Unique_Percentage  \
0            name          1983          25.079044   
1            year            27           0.341470   
2   selling_price           670           8.473504   
3       km_driven           898          11.357025   
4            fuel             4           0.050588   
5     seller_type             3           0.037941   
6    transmission             2           0.025294   
7           owner             5           0.063235   
8           seats             9           0.113823   
9     mileage_num           381           4.818515   
10      engine_cc           121           1.530290   
11  max_power_bhp           318           4.021753   
12      torque_nm           244           3.085873   
13     torque_rpm            56           0.708233   

                                        Sample_Values  
0   [Maruti Swift Dzire VDI, Skoda Rapid 1.5 TDI A...  
1                      [2014, 2006, 2010, 2007, 2017]  
2            [450000, 370000, 158000, 225000, 130000]  
3             [145500, 120000, 140000, 127000, 45000]  
4                          [Diesel, Petrol, LPG, CNG]  
5              [Individual, Dealer, Trustmark Dealer]  
6                                 [Manual, Automatic]  
7   [First Owner, Second Owner, Third Owner, Fourt...  
8                           [5.0, 4.0, 7.0, 8.0, 6.0]  
9                     [23.4, 21.14, 17.7, 23.0, 16.1]  
10           [1248.0, 1498.0, 1497.0, 1396.0, 1298.0]  
11                   [74.0, 103.52, 78.0, 90.0, 88.2]  
12                   [190.0, 250.0, 12.7, 22.4, 11.5]  
13                 [2000.0, 1500.0, 2.0, 2750.0, 4.0]  

--------------------------------------------------------------------------------
Categorical Columns - Value Counts:

name:
name
Maruti Swift Dzire VDI                          129
Maruti Alto 800 LXI                              82
Maruti Alto LXi                                  71
BMW X4 M Sport X xDrive20d                       62
Maruti Swift VDI                                 61
                                               ... 
Honda Jazz 1.2 S AT i VTEC                        1
Hyundai i20 Active 1.2 SX with AVN                1
Ambassador CLASSIC 1500 DSL AC                    1
Tata Harrier XZ Dark Edition BSIV                 1
Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV      1
Name: count, Length: 1983, dtype: int64

fuel:
fuel
Diesel    4299
Petrol    3520
CNG         53
LPG         35
Name: count, dtype: int64

seller_type:
seller_type
Individual          6564
Dealer              1107
Trustmark Dealer     236
Name: count, dtype: int64

transmission:
transmission
Manual       6866
Automatic    1041
Name: count, dtype: int64

owner:
owner
First Owner             5215
Second Owner            2017
Third Owner              510
Fourth & Above Owner     160
Test Drive Car             5
Name: count, dtype: int64
In [22]:
def distribution_analysis(df):
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    # Distribution metrics
    print("\nDistribution Metrics:")
    for col in numerical_cols:
        print(f"\n{col}:")
        print(f"  Skewness: {skew(df[col].dropna()):.4f}")
        print(f"  Kurtosis: {kurtosis(df[col].dropna()):.4f}")
        
        # Normality test
        if len(df[col].dropna()) >= 3:
            stat, p_value = stats.shapiro(df[col].dropna().sample(min(5000, len(df[col].dropna()))))
            print(f"  Shapiro-Wilk test p-value: {p_value:.4f}")
            print(f"  Normal distribution: {'Yes' if p_value > 0.05 else 'No'}")
    
    # Histograms
    n_cols = len(numerical_cols)
    n_rows = (n_cols + 2) // 3
    
    fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
    axes = axes.flatten() if n_cols > 1 else [axes]
    
    for idx, col in enumerate(numerical_cols):
        axes[idx].hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
        axes[idx].set_title(f'Distribution of {col}')
        axes[idx].set_xlabel(col)
        axes[idx].set_ylabel('Frequency')
        axes[idx].grid(True, alpha=0.3)
    
    # Hide empty subplots
    for idx in range(len(numerical_cols), len(axes)):
        axes[idx].axis('off')
    
    plt.tight_layout()
    plt.show()
    
    # Box plots
    fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
    axes = axes.flatten() if n_cols > 1 else [axes]
    
    for idx, col in enumerate(numerical_cols):
        axes[idx].boxplot(df[col].dropna())
        axes[idx].set_title(f'Box Plot of {col}')
        axes[idx].set_ylabel(col)
        axes[idx].grid(True, alpha=0.3)
    
    for idx in range(len(numerical_cols), len(axes)):
        axes[idx].axis('off')
    
    plt.tight_layout()
    plt.show()

distribution_analysis(df)
Distribution Metrics:

year:
  Skewness: -0.9970
  Kurtosis: 1.2795
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No

selling_price:
  Skewness: 4.1562
  Kurtosis: 20.6221
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No

km_driven:
  Skewness: 11.3406
  Kurtosis: 388.4493
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No

seats:
  Skewness: 1.9695
  Kurtosis: 3.7963
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No

mileage_num:
  Skewness: -0.1426
  Kurtosis: 0.5687
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No

engine_cc:
  Skewness: 1.1351
  Kurtosis: 0.7313
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No

max_power_bhp:
  Skewness: 1.6394
  Kurtosis: 3.7985
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No

torque_nm:
  Skewness: 1.0602
  Kurtosis: 1.9373
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No

torque_rpm:
  Skewness: 0.1391
  Kurtosis: -0.9095
  Shapiro-Wilk test p-value: 0.0000
  Normal distribution: No
No description has been provided for this image
No description has been provided for this image
Selling Price¶
  • Highly right-skewed.
  • Majority of vehicles are in lower price ranges.
  • Few luxury cars create extreme high-price outliers.

Implication: Log transformation may improve model performance.

Kilometers Driven¶
  • Distribution shows most vehicles in mid-range mileage.
  • Long tail exists for high-mileage vehicles.

Implication: Higher mileage often lowers price.

Engine Capacity & Power¶
  • Engine sizes cluster around common passenger vehicle ranges.
  • Power distribution shows majority of cars in moderate power range.
  • High-performance vehicles form outliers.

Implication: Engine size and power strongly influence price.

Mileage Efficiency¶
  • Mileage values vary across fuel and engine types.

Implication: Higher mileage efficiency may influence resale value but not as strongly as engine power.

In [23]:
def outlier_detection(df):

    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    print("\nIQR Method (Outliers beyond 1.5 * IQR):")
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_count = len(outliers)
        outlier_percentage = (outlier_count / len(df)) * 100
        
        print(f"\n{col}:")
        print(f"  Lower bound: {lower_bound:.2f}")
        print(f"  Upper bound: {upper_bound:.2f}")
        print(f"  Outliers count: {outlier_count}")
        print(f"  Outliers percentage: {outlier_percentage:.2f}%")
    
    print("Z-Score Method (|Z-score| > 3):")
    for col in numerical_cols:
        z_scores = np.abs(stats.zscore(df[col].dropna()))
        outliers = np.sum(z_scores > 3)
        outlier_percentage = (outliers / len(df[col].dropna())) * 100
        
        print(f"\n{col}:")
        print(f"  Outliers count: {outliers}")
        print(f"  Outliers percentage: {outlier_percentage:.2f}%")
In [24]:
outlier_detection(df)
IQR Method (Outliers beyond 1.5 * IQR):

year:
  Lower bound: 2004.50
  Upper bound: 2024.50
  Outliers count: 166
  Outliers percentage: 2.10%

selling_price:
  Lower bound: -360000.00
  Upper bound: 1320000.00
  Outliers count: 598
  Outliers percentage: 7.56%

km_driven:
  Lower bound: -56125.00
  Upper bound: 186875.00
  Outliers count: 193
  Outliers percentage: 2.44%

seats:
  Lower bound: 5.00
  Upper bound: 5.00
  Outliers count: 1653
  Outliers percentage: 20.91%

mileage_num:
  Lower bound: 8.47
  Upper bound: 30.63
  Outliers count: 24
  Outliers percentage: 0.30%

engine_cc:
  Lower bound: 619.50
  Upper bound: 2159.50
  Outliers count: 1183
  Outliers percentage: 14.96%

max_power_bhp:
  Lower bound: 17.12
  Upper bound: 152.93
  Outliers count: 564
  Outliers percentage: 7.13%

torque_nm:
  Lower bound: -50.50
  Upper bound: 353.50
  Outliers count: 404
  Outliers percentage: 5.11%

torque_rpm:
  Lower bound: -1325.00
  Upper bound: 6875.00
  Outliers count: 0
  Outliers percentage: 0.00%
Z-Score Method (|Z-score| > 3):

year:
  Outliers count: 78
  Outliers percentage: 0.99%

selling_price:
  Outliers count: 229
  Outliers percentage: 2.90%

km_driven:
  Outliers count: 70
  Outliers percentage: 0.89%

seats:
  Outliers count: 102
  Outliers percentage: 1.29%

mileage_num:
  Outliers count: 24
  Outliers percentage: 0.30%

engine_cc:
  Outliers count: 78
  Outliers percentage: 0.99%

max_power_bhp:
  Outliers count: 90
  Outliers percentage: 1.14%

torque_nm:
  Outliers count: 53
  Outliers percentage: 0.67%

torque_rpm:
  Outliers count: 0
  Outliers percentage: 0.00%
In [25]:
def correlation_analysis(df):
    
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    # Pearson correlation
    print("\nPearson Correlation Matrix:")
    corr_matrix = df[numerical_cols].corr()
    print(corr_matrix)
    
    # Heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', 
                center=0, square=True, linewidths=1)
    plt.title('Correlation Heatmap (Pearson)')
    plt.tight_layout()
    plt.show()
    
    # High correlations
    print("High Correlations (|r| > 0.7):")
    high_corr = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > 0.7:
                high_corr.append({
                    'Feature 1': corr_matrix.columns[i],
                    'Feature 2': corr_matrix.columns[j],
                    'Correlation': corr_matrix.iloc[i, j]
                })
    
    if high_corr:
        print(pd.DataFrame(high_corr))
    else:
        print("No high correlations found.")
    
    # Spearman correlation
    print("Spearman Correlation Matrix:")
    spearman_corr = df[numerical_cols].corr(method='spearman')
    print(spearman_corr)
In [26]:
correlation_analysis(df)
Pearson Correlation Matrix:
                   year  selling_price  km_driven     seats  mileage_num  \
year           1.000000       0.412268  -0.428434 -0.009144     0.329145   
selling_price  0.412268       1.000000  -0.222196  0.041358    -0.126054   
km_driven     -0.428434      -0.222196   1.000000  0.227336    -0.173073   
seats         -0.009144       0.041358   0.227336  1.000000    -0.452085   
mileage_num    0.329145      -0.126054  -0.173073 -0.452085     1.000000   
engine_cc      0.018848       0.455734   0.205914  0.610309    -0.575831   
max_power_bhp  0.226598       0.749674  -0.038159  0.191999    -0.374621   
torque_nm      0.289279       0.619784  -0.003332  0.274297    -0.171046   
torque_rpm     0.157434      -0.132285  -0.275886 -0.315482     0.092260   

               engine_cc  max_power_bhp  torque_nm  torque_rpm  
year            0.018848       0.226598   0.289279    0.157434  
selling_price   0.455734       0.749674   0.619784   -0.132285  
km_driven       0.205914      -0.038159  -0.003332   -0.275886  
seats           0.610309       0.191999   0.274297   -0.315482  
mileage_num    -0.575831      -0.374621  -0.171046    0.092260  
engine_cc       1.000000       0.703975   0.628822   -0.429501  
max_power_bhp   0.703975       1.000000   0.770206   -0.216084  
torque_nm       0.628822       0.770206   1.000000   -0.371573  
torque_rpm     -0.429501      -0.216084  -0.371573    1.000000  
No description has been provided for this image
High Correlations (|r| > 0.7):
       Feature 1      Feature 2  Correlation
0  selling_price  max_power_bhp     0.749674
1      engine_cc  max_power_bhp     0.703975
2  max_power_bhp      torque_nm     0.770206
Spearman Correlation Matrix:
                   year  selling_price  km_driven     seats  mileage_num  \
year           1.000000       0.699282  -0.622829 -0.010459     0.304563   
selling_price  0.699282       1.000000  -0.348455  0.263143    -0.029221   
km_driven     -0.622829      -0.348455   1.000000  0.228042    -0.163102   
seats         -0.010459       0.263143   0.228042  1.000000    -0.439428   
mileage_num    0.304563      -0.029221  -0.163102 -0.439428     1.000000   
engine_cc     -0.002641       0.513526   0.241342  0.519604    -0.462244   
max_power_bhp  0.221005       0.671143  -0.026260  0.283206    -0.345271   
torque_nm      0.273297       0.654343   0.058245  0.346184    -0.085568   
torque_rpm     0.113698      -0.218231  -0.323879 -0.401392     0.150710   

               engine_cc  max_power_bhp  torque_nm  torque_rpm  
year           -0.002641       0.221005   0.273297    0.113698  
selling_price   0.513526       0.671143   0.654343   -0.218231  
km_driven       0.241342      -0.026260   0.058245   -0.323879  
seats           0.519604       0.283206   0.346184   -0.401392  
mileage_num    -0.462244      -0.345271  -0.085568    0.150710  
engine_cc       1.000000       0.740780   0.708355   -0.546173  
max_power_bhp   0.740780       1.000000   0.704594   -0.285792  
torque_nm       0.708355       0.704594   1.000000   -0.466410  
torque_rpm     -0.546173      -0.285792  -0.466410    1.000000  

Key correlations observed:¶

Selling price strongly correlates with:¶
  • Engine capacity
  • Maximum power
  • Manufacturing year
Negative correlation observed with:¶
  • Kilometers driven

Implication: Cars with larger engines, higher power, and newer manufacturing years tend to be priced higher.

In [27]:
def categorical_analysis(df):

    new_category_cols = ['owner', 'fuel', 'seller_type', 'transmission']
    
    # keep only columns that exist in df
    categorical_cols = [c for c in new_category_cols if c in df.columns]

    for col in categorical_cols:
        plt.figure(figsize=(10, 6))
        df[col].value_counts().plot(kind='bar', edgecolor='black', alpha=0.7)

        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
In [28]:
categorical_analysis(df)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Fuel Type¶
  • Diesel and Petrol vehicles dominate the dataset.
  • Alternative fuels appear less frequently.
  • Price patterns vary slightly by fuel type.
Transmission¶
  • Manual vehicles are more common.
  • Automatic vehicles often command higher prices.
Ownership¶
  • First-owner cars typically have higher resale value.
  • Cars with multiple owners tend to sell cheaper.
Seller Type¶
  • Dealer listings sometimes show higher prices compared to individuals.
In [29]:
def bivariate_analysis(df, target_col='selling_price'):

    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    new_category_cols = ['owner', 'fuel', 'seller_type', 'transmission']
    # keep only columns that exist in df
    categorical_cols = [c for c in new_category_cols if c in df.columns]
    
    # Numerical vs Target
    print(f"\nCorrelation with {target_col}:")
    if target_col in numerical_cols:
        correlations = df[numerical_cols].corrwith(df[target_col]).sort_values(ascending=False)
        print(correlations)
        
        # Scatter plots
        other_numerical = [col for col in numerical_cols if col != target_col]
        n_cols = len(other_numerical)
        n_rows = (n_cols + 2) // 3
        
        fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
        axes = axes.flatten() if n_cols > 1 else [axes]
        
        for idx, col in enumerate(other_numerical):
            axes[idx].scatter(df[col], df[target_col], alpha=0.5)
            axes[idx].set_xlabel(col)
            axes[idx].set_ylabel(target_col)
            axes[idx].set_title(f'{col} vs {target_col}')
            axes[idx].grid(True, alpha=0.3)
        
        for idx in range(len(other_numerical), len(axes)):
            axes[idx].axis('off')
        
        plt.tight_layout()
        plt.show()
    
    # Categorical vs Target
    print(f"Categorical features vs {target_col}:")
    
    for col in categorical_cols:
        print(f"\n{col}:")
        group_stats = df.groupby(col)[target_col].agg(['mean', 'median', 'std', 'count'])
        print(group_stats)
        
        # Box plot
        plt.figure(figsize=(12, 6))
        df.boxplot(column=target_col, by=col, figsize=(12, 6))
        plt.title(f'{target_col} by {col}')
        plt.suptitle('')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
In [30]:
bivariate_analysis(df)
Correlation with selling_price:
selling_price    1.000000
max_power_bhp    0.749674
torque_nm        0.619784
engine_cc        0.455734
year             0.412268
seats            0.041358
mileage_num     -0.126054
torque_rpm      -0.132285
km_driven       -0.222196
dtype: float64
No description has been provided for this image
Categorical features vs selling_price:

owner:
                              mean     median           std  count
owner                                                             
First Owner           7.899799e+05   550000.0  9.362189e+05   5215
Fourth & Above Owner  2.331968e+05   185000.0  1.586916e+05    160
Second Owner          4.010442e+05   320000.0  3.273773e+05   2017
Test Drive Car        4.403800e+06  5923000.0  2.510598e+06      5
Third Owner           2.931873e+05   230499.5  2.303804e+05    510
<Figure size 1200x600 with 0 Axes>
No description has been provided for this image
fuel:
                 mean    median            std  count
fuel                                                 
CNG     313415.037736  330000.0  105577.722776     53
Diesel  802450.282624  550000.0  915753.684639   4299
LPG     210885.714286  200000.0   79828.039976     35
Petrol  472665.203125  350000.0  631712.877861   3520
<Figure size 1200x600 with 0 Axes>
No description has been provided for this image
seller_type:
                          mean    median           std  count
seller_type                                                  
Dealer            1.459910e+06  700000.0  1.576638e+06   1107
Individual        5.076407e+05  400000.0  4.872636e+05   6564
Trustmark Dealer  8.018390e+05  650000.0  5.080779e+05    236
<Figure size 1200x600 with 0 Axes>
No description has been provided for this image
transmission:
                      mean     median           std  count
transmission                                              
Automatic     1.882972e+06  1100000.0  1.641287e+06   1041
Manual        4.627634e+05   409999.0  2.975859e+05   6866
<Figure size 1200x600 with 0 Axes>
No description has been provided for this image

Key trends:

  • Selling price decreases as kilometers driven increases.
  • Price increases with engine size and power.
  • Newer vehicles have significantly higher prices.
  • Automatic transmission vehicles often cost more.
In [31]:
def pairplot_analysis(df, sample_size=2000):
    
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    # Sample data if too large
    if len(df) > sample_size:
        df_sample = df[numerical_cols].sample(sample_size)
        print(f"\nUsing sample of {sample_size} rows for pair plot")
    else:
        df_sample = df[numerical_cols]
    
    print("\nGenerating pair plot...")
    sns.pairplot(df_sample, diag_kind='kde', plot_kws={'alpha': 0.6})
    plt.tight_layout()
    plt.show()
In [32]:
pairplot_analysis(df)
Using sample of 2000 rows for pair plot

Generating pair plot...
No description has been provided for this image
In [33]:
def time_series_analysis(df, time_col='year'):
    
    if time_col in df.columns:
        print(f"\nAnalyzing trends by {time_col}:")
        
        numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numerical_cols = [col for col in numerical_cols if col != time_col]
        
        for col in numerical_cols:
            yearly_stats = df.groupby(time_col)[col].agg(['mean', 'median', 'count'])
            print(f"\n{col} by {time_col}:")
            print(yearly_stats)
            
            # Line plot
            plt.figure(figsize=(12, 6))
            yearly_stats['mean'].plot(marker='o', linewidth=2)
            plt.title(f'Average {col} by {time_col}')
            plt.xlabel(time_col)
            plt.ylabel(f'Average {col}')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()
In [34]:
time_series_analysis(df)
Analyzing trends by year:

selling_price by year:
              mean    median  count
year                               
1994  7.200000e+04   72000.0      2
1995  5.500000e+04   55000.0      1
1996  4.100000e+04   41000.0      2
1997  8.611100e+04   50000.0      9
1998  5.788889e+04   58000.0      9
1999  7.692857e+04   62500.0     14
2000  7.668212e+04   77500.0     16
2001  4.650000e+04   45000.0      6
2002  1.037894e+05   75000.0     19
2003  1.015729e+05   85000.0     37
2004  1.094089e+05   90000.0     51
2005  1.437566e+05  110000.0     76
2006  1.704622e+05  140000.0    102
2007  1.791065e+05  142000.0    173
2008  2.055598e+05  170000.0    201
2009  2.288095e+05  180000.0    231
2010  2.740106e+05  220000.0    375
2011  3.220964e+05  265000.0    570
2012  3.531433e+05  300000.0    621
2013  4.606646e+05  389500.0    668
2014  5.166757e+05  450500.0    620
2015  5.967703e+05  525000.0    775
2016  7.003177e+05  550000.0    856
2017  8.901930e+05  645000.0   1010
2018  9.581836e+05  715000.0    806
2019  1.776986e+06  779000.0    583
2020  8.852702e+05  590000.0     74
No description has been provided for this image
km_driven by year:
               mean    median  count
year                                
1994   80000.000000   80000.0      2
1995   70000.000000   70000.0      1
1996   51000.000000   51000.0      2
1997   99487.555556  100000.0      9
1998   71111.111111   70000.0      9
1999   89094.285714   75000.0     14
2000  106821.562500   90000.0     16
2001   48184.666667   38554.0      6
2002   94492.894737   80000.0     19
2003   95697.081081   90000.0     37
2004   98883.274510   96000.0     51
2005  104011.894737   90000.0     76
2006  108693.568627  100000.0    102
2007  115765.884393  100000.0    173
2008   99125.134328  100000.0    201
2009   98978.813853   90000.0    231
2010  102048.992000   91000.0    375
2011   96860.884211   90000.0    570
2012   94988.974235   90000.0    621
2013   85522.365269   79328.0    668
2014   78887.977419   72000.0    620
2015   71672.080000   63063.0    775
2016   58877.210280   56494.0    856
2017   46525.788119   40000.0   1010
2018   33916.936725   30000.0    806
2019   16307.900515   10500.0    583
2020   20525.716216    7350.0     74
No description has been provided for this image
seats by year:
          mean  median  count
year                         
1994  4.500000     4.5      2
1995  4.000000     4.0      1
1996  4.000000     4.0      2
1997  4.666667     4.0      9
1998  4.555556     5.0      9
1999  4.500000     4.5     14
2000  5.187500     5.0     16
2001  4.333333     4.0      6
2002  5.210526     5.0     19
2003  5.081081     5.0     37
2004  4.862745     5.0     51
2005  5.394737     5.0     76
2006  5.431373     5.0    102
2007  5.381503     5.0    173
2008  5.492537     5.0    201
2009  5.367965     5.0    231
2010  5.416000     5.0    375
2011  5.445614     5.0    570
2012  5.431562     5.0    621
2013  5.600299     5.0    668
2014  5.601613     5.0    620
2015  5.598710     5.0    775
2016  5.463785     5.0    856
2017  5.312871     5.0   1010
2018  5.277916     5.0    806
2019  5.204117     5.0    583
2020  5.243243     5.0     74
No description has been provided for this image
mileage_num by year:
           mean  median  count
year                          
1994  14.450000   14.45      2
1995  16.100000   16.10      1
1996  16.100000   16.10      2
1997  16.066667   16.10      9
1998  15.933333   16.10      9
1999  15.964286   16.10     14
2000  16.368750   16.10     16
2001  16.700000   16.10      6
2002  16.327895   16.10     19
2003  15.778378   16.10     37
2004  15.952941   16.10     51
2005  15.943421   16.10     76
2006  16.174020   16.80    102
2007  16.614277   17.20    173
2008  16.542637   17.50    201
2009  17.011126   17.80    231
2010  17.504453   18.00    375
2011  18.026281   18.60    570
2012  19.102206   19.40    621
2013  19.296243   20.36    668
2014  19.722645   20.36    620
2015  20.136452   20.45    775
2016  20.293014   20.14    856
2017  20.864713   21.04   1010
2018  21.024715   21.13    806
2019  19.918491   19.50    583
2020  19.833108   20.89     74
No description has been provided for this image
engine_cc by year:
             mean  median  count
year                            
1994  1395.500000  1395.5      2
1995   796.000000   796.0      1
1996   796.000000   796.0      2
1997   929.444444   796.0      9
1998   950.777778   796.0      9
1999  1153.428571   894.5     14
2000  1011.062500   796.0     16
2001   828.833333   796.0      6
2002  1187.000000   993.0     19
2003  1302.675676   998.0     37
2004  1252.490196   998.0     51
2005  1421.947368  1341.0     76
2006  1456.656863  1405.0    102
2007  1461.387283  1396.0    173
2008  1472.636816  1396.0    201
2009  1420.930736  1248.0    231
2010  1436.133333  1248.0    375
2011  1531.138596  1397.5    570
2012  1465.280193  1396.0    621
2013  1528.327844  1392.0    668
2014  1517.109677  1364.0    620
2015  1494.689032  1373.0    775
2016  1434.808411  1248.0    856
2017  1419.234653  1248.0   1010
2018  1404.681141  1248.0    806
2019  1492.324185  1199.0    583
2020  1360.054054  1197.0     74
No description has been provided for this image
max_power_bhp by year:
            mean  median  count
year                           
1994   44.500000  44.500      2
1995   37.000000  37.000      1
1996   37.000000  37.000      2
1997   49.666667  37.000      9
1998   48.888889  37.000      9
1999   53.785714  48.500     14
2000   48.093333  46.300     15
2001   43.500000  37.000      6
2002   58.515789  60.000     19
2003   68.829730  66.000     37
2004   70.956863  62.000     51
2005   77.257895  69.000     76
2006   80.700980  74.000    102
2007   78.251387  67.000    173
2008   81.149950  70.000    201
2009   83.308139  73.900    231
2010   85.013573  75.000    375
2011   87.883719  75.000    570
2012   86.536812  78.000    621
2013   88.651108  80.900    668
2014   89.951871  82.000    620
2015   92.668852  88.500    775
2016   92.549562  88.500    856
2017   97.277218  86.795   1010
2018   95.964529  84.000    806
2019  114.106930  88.700    583
2020   93.640135  81.860     74
No description has been provided for this image
torque_nm by year:
            mean  median  count
year                           
1994   82.500000   82.50      2
1995   59.000000   59.00      1
1996   59.000000   59.00      2
1997   71.666667   59.00      9
1998   66.344444   59.00      9
1999   59.035714   59.00     14
2000   64.456250   62.00     16
2001   64.166667   59.00      6
2002  115.715789   62.00     19
2003  115.908108   78.00     37
2004   74.284314   59.00     51
2005   81.931579   62.00     76
2006   74.576471   59.00    102
2007   83.514335   62.00    173
2008   95.944179   84.00    201
2009  120.586667   99.04    231
2010  127.309013  110.00    375
2011  156.955544  145.00    570
2012  166.619163  160.00    621
2013  182.045554  190.00    668
2014  186.246903  190.00    620
2015  186.209923  190.00    775
2016  174.057056  173.75    856
2017  192.094980  190.00   1010
2018  184.826911  170.00    806
2019  197.050137  142.00    583
2020  165.353649  113.75     74
No description has been provided for this image
torque_rpm by year:
             mean  median  count
year                            
1994  2350.000000  2350.0      2
1995  2500.000000  2500.0      1
1996  2500.000000  2500.0      2
1997  3166.666667  2500.0      9
1998  3255.555556  2800.0      9
1999  2536.357143  2500.0     14
2000  2765.812500  2500.0     16
2001  3000.000000  2500.0      6
2002  3044.894737  2500.0     19
2003  2463.864865  2500.0     37
2004  2565.156863  2500.0     51
2005  2454.671053  3000.0     76
2006  1681.941176  1750.0    102
2007  1701.722543  1700.0    173
2008  1838.606965  2000.0    201
2009  2553.571429  2750.0    231
2010  2592.450667  2750.0    375
2011  2454.680702  2000.0    570
2012  2471.716586  2000.0    621
2013  2372.878743  2000.0    668
2014  2448.774194  2000.0    620
2015  2518.033548  1900.0    775
2016  2857.695093  2500.0    856
2017  2638.899010  2000.0   1010
2018  2767.153846  2000.0    806
2019  3092.819897  3500.0    583
2020  3020.270270  3500.0     74
No description has been provided for this image
  1. Average Selling Price by Year
  • Overall Trend: Dramatic exponential increase from 2010 onwards
  • 1983-2010: Relatively flat, ranging from ~50K to 250K
  • 2010-2019: Steady upward climb from 250K to ~1M
  • 2019-2020: Massive spike to 1.75M (likely luxury/premium cars entering the market)
  • 2020-2021: Sharp drop back to ~900K Interpretation: Recent years show much higher car prices, possibly due to inflation, luxury segment growth, or currency effects
  1. Average KM Driven by Year
  • Highly volatile pattern with significant fluctuations
  • Peak at 1991: ~120K km (older cars had more usage)
  • 1996 dip: Drops to ~36K km
  • 2000-2010: Fluctuates between 60K-115K km
  • 2010-2020: Clear downward trend from 100K to ~20K km Interpretation: Newer cars in the dataset have driven fewer kilometers (they're newer/less used)
  1. Average Seats by Year
  • Range: 4-5.6 seats across all years
  • 990s: More variation (4-5.2 seats)
  • 2005-2015: Stabilized around 5.4-5.6 seats (peak SUV/family car popularity)
  • 2015-2020: Slight decline to ~5.2 seats Interpretation: Shift toward larger vehicles (SUVs, MPVs) in 2000s-2010s, then slight return to smaller cars
  1. Average Mileage (Fuel Efficiency) by Year
  • Strong upward trend from 1994 onwards
  • 1994: ~14.5 km/L
  • 2018: Peak at ~21 km/L (45% improvement!)
  • 2018-2021: Slight decline to ~20 km/L Interpretation: Clear technological improvement in fuel efficiency due to better engines, aerodynamics, and hybrid technology
  1. Average Engine CC by Year
  • Highly variable with no clear linear trend
  • 1994: 1,400 cc
  • 1995-1997: Sharp drop to ~800 cc (small car boom)
  • 2005-2013: Rises to 1,400-1,550 cc (larger engines)
  • 2020-2021: Drops to ~1,360 cc Interpretation: Market shifts between small economy cars and larger performance vehicles
  1. Average Max Power (BHP) by Year
  • Strong upward trend indicating more powerful engines
  • 1994: ~45 BHP
  • 2019: Peak at ~114 BHP (2.5x increase!)
  • 2000s: Acceleration in power increase begins Interpretation: Consumer demand for performance; turbocharged engines became common
  1. Average Torque (Nm) by Year
  • Dramatic upward trend especially after 2008
  • 1994: ~82 Nm
  • 2019: ~197 Nm (2.4x increase!)
  • 2008-2019: Steepest increase period Interpretation: Better engine technology (turbocharging, diesel engines) providing more low-end power
  1. Average Torque RPM by Year
  • Most volatile graph with major fluctuations
  • 1998-1999: Peak at ~3,250 RPM
  • 2006-2007: Dramatic drop to ~1,700 RPM (diesel engine adoption?)
  • 2019: Returns to ~3,150 RPM Interpretation: Lower RPM = torque available earlier (diesel characteristic); recent rise suggests petrol/hybrid shift
In [35]:
target = df['selling_price']
In [36]:
# ensure target_col is a column name (string)
if isinstance(target, pd.Series):
    target_col = target.name
else:
    target_col = target

if target_col in df.columns:
    # numeric columns excluding target
    num_cols = [c for c in df.select_dtypes(include=[np.number]).columns.tolist() if c != target_col]
    # categorical columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Target vs numeric features (scatter)
    for col in num_cols:
        tmp = df[[col, target_col]].dropna()
        if tmp.empty:
            continue
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.scatter(tmp[col], tmp[target_col], alpha=0.4)
        ax.set_title(f"{target_col} vs {col}")
        ax.set_xlabel(col)
        ax.set_ylabel(target_col)
        plt.tight_layout()
        plt.show()

    # Target vs categorical features (box via grouped data)
    for col in cat_cols:
        # limit to top categories to avoid unreadable plots
        top = df[col].value_counts().head(10).index
        tmp = df[df[col].isin(top)][[col, target_col]].dropna()
        if tmp.empty:
            continue

        groups = [tmp.loc[tmp[col] == k, target_col].values for k in top]
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
        ax.set_title(f"{target_col} by {col} (top 10 categories)")
        ax.set_xlabel(col)
        ax.set_ylabel(target_col)
        ax.tick_params(axis="x", rotation=45)
        plt.tight_layout()
        plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
No description has been provided for this image
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
No description has been provided for this image
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
No description has been provided for this image
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
No description has been provided for this image
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
No description has been provided for this image

FEATURE ENGINEERING¶

In [37]:
#The dataset max year is 2020, so we will use that as the current year to calculate the age of the car. We will then drop the year column as it is no longer needed.
current_year = 2020
df["car_age"] = current_year - df["year"]
df.drop(columns=["year"], inplace=True)
In [38]:
# Normalizing the km_driven by the age of the car to get a better sense of how much the car has been driven per year. 
# This can help us understand if a car has been driven more than average for its age, which could affect its price.
df["km_per_year"] = df["km_driven"] / (df["car_age"] + 1)
In [39]:
# Engine power to engine size ratio, which can be an indicator of the car's performance. 
# A higher ratio may indicate a more powerful engine relative to its size, which could affect the car's price.
df["power_to_engine"] = df["max_power_bhp"] / (df["engine_cc"] + 1)
In [40]:
# Mapping the owner column to numeric values, where "First Owner" is 1, "Second Owner" is 2, and so on.
# This allows us to use the owner information in a numerical format for modeling, while still capturing the ordinal nature of ownership 
owner_map = {
    "First Owner": 1,
    "Second Owner": 2,
    "Third Owner": 3,
    "Fourth & Above Owner": 4,
    "Test Drive Car": 0
}

df["owner_num"] = df["owner"].map(owner_map)
df.drop(columns=["owner"], inplace=True)
In [41]:
# Extracting the brand from the name column, 
# which can be a useful feature for modeling as certain brands may have higher resale values than others.
df["brand"] = df["name"].apply(lambda x: x.split()[0])
df.drop(columns=["name"], inplace=True)
In [42]:
df = pd.get_dummies(df, columns=["brand"], drop_first=True)
In [43]:
# Encoding the transmission column as a binary variable, where "Automatic" is 1 and "Manual" is 0.
df["is_automatic"] = (df["transmission"] == "Automatic").astype(int)
df.drop(columns=["transmission"], inplace=True)
In [44]:
# Encoding the fuel column using one-hot encoding, which creates binary columns for each fuel type. We drop the first category to avoid multicollinearity.
df = pd.get_dummies(df, columns=["fuel"], drop_first=True)
In [45]:
# Encoding the seller_type column using one-hot encoding, which creates binary columns for each seller type. We drop the first category to avoid multicollinearity.
df = pd.get_dummies(df, columns=["seller_type"], drop_first=True)
In [46]:
# Log-transforming the selling_price to reduce skewness and make the distribution more normal, which can help improve the performance of linear models.
df["selling_price"] = np.log1p(df["selling_price"])
In [47]:
df.head()
Out[47]:
selling_price km_driven seats mileage_num engine_cc max_power_bhp torque_nm torque_rpm car_age km_per_year ... brand_Tata brand_Toyota brand_Volkswagen brand_Volvo is_automatic fuel_Diesel fuel_LPG fuel_Petrol seller_type_Individual seller_type_Trustmark Dealer
0 13.017005 145500 5.0 23.40 1248.0 74.00 190.0 2000.0 6 20785.714286 ... False False False False 0 True False False True False
1 12.821261 120000 5.0 21.14 1498.0 103.52 250.0 1500.0 6 17142.857143 ... False False False False 0 True False False True False
2 11.970357 140000 5.0 17.70 1497.0 78.00 12.7 2.0 14 9333.333333 ... False False False False 0 False False True True False
3 12.323860 127000 5.0 23.00 1396.0 90.00 22.4 2750.0 10 11545.454545 ... False False False False 0 True False False True False
4 11.775297 120000 5.0 16.10 1298.0 88.20 11.5 4.0 13 8571.428571 ... False False False False 0 False False True True False

5 rows × 48 columns

In [48]:
df = df.drop(columns=["torque_nm", "torque_rpm"])
In [49]:
df.columns
Out[49]:
Index(['selling_price', 'km_driven', 'seats', 'mileage_num', 'engine_cc',
       'max_power_bhp', 'car_age', 'km_per_year', 'power_to_engine',
       'owner_num', 'brand_Ashok', 'brand_Audi', 'brand_BMW',
       'brand_Chevrolet', 'brand_Daewoo', 'brand_Datsun', 'brand_Fiat',
       'brand_Force', 'brand_Ford', 'brand_Honda', 'brand_Hyundai',
       'brand_Isuzu', 'brand_Jaguar', 'brand_Jeep', 'brand_Kia', 'brand_Land',
       'brand_Lexus', 'brand_MG', 'brand_Mahindra', 'brand_Maruti',
       'brand_Mercedes-Benz', 'brand_Mitsubishi', 'brand_Nissan', 'brand_Opel',
       'brand_Renault', 'brand_Skoda', 'brand_Tata', 'brand_Toyota',
       'brand_Volkswagen', 'brand_Volvo', 'is_automatic', 'fuel_Diesel',
       'fuel_LPG', 'fuel_Petrol', 'seller_type_Individual',
       'seller_type_Trustmark Dealer'],
      dtype='object')
In [50]:
df = df.drop(columns=[
    "km_driven",
    "engine_cc"
])
In [51]:
df.columns
Out[51]:
Index(['selling_price', 'seats', 'mileage_num', 'max_power_bhp', 'car_age',
       'km_per_year', 'power_to_engine', 'owner_num', 'brand_Ashok',
       'brand_Audi', 'brand_BMW', 'brand_Chevrolet', 'brand_Daewoo',
       'brand_Datsun', 'brand_Fiat', 'brand_Force', 'brand_Ford',
       'brand_Honda', 'brand_Hyundai', 'brand_Isuzu', 'brand_Jaguar',
       'brand_Jeep', 'brand_Kia', 'brand_Land', 'brand_Lexus', 'brand_MG',
       'brand_Mahindra', 'brand_Maruti', 'brand_Mercedes-Benz',
       'brand_Mitsubishi', 'brand_Nissan', 'brand_Opel', 'brand_Renault',
       'brand_Skoda', 'brand_Tata', 'brand_Toyota', 'brand_Volkswagen',
       'brand_Volvo', 'is_automatic', 'fuel_Diesel', 'fuel_LPG', 'fuel_Petrol',
       'seller_type_Individual', 'seller_type_Trustmark Dealer'],
      dtype='object')
In [60]:
df = df.dropna()
df.isna().sum()
Out[60]:
selling_price                   0
seats                           0
mileage_num                     0
max_power_bhp                   0
car_age                         0
km_per_year                     0
power_to_engine                 0
owner_num                       0
brand_Ashok                     0
brand_Audi                      0
brand_BMW                       0
brand_Chevrolet                 0
brand_Daewoo                    0
brand_Datsun                    0
brand_Fiat                      0
brand_Force                     0
brand_Ford                      0
brand_Honda                     0
brand_Hyundai                   0
brand_Isuzu                     0
brand_Jaguar                    0
brand_Jeep                      0
brand_Kia                       0
brand_Land                      0
brand_Lexus                     0
brand_MG                        0
brand_Mahindra                  0
brand_Maruti                    0
brand_Mercedes-Benz             0
brand_Mitsubishi                0
brand_Nissan                    0
brand_Opel                      0
brand_Renault                   0
brand_Skoda                     0
brand_Tata                      0
brand_Toyota                    0
brand_Volkswagen                0
brand_Volvo                     0
is_automatic                    0
fuel_Diesel                     0
fuel_LPG                        0
fuel_Petrol                     0
seller_type_Individual          0
seller_type_Trustmark Dealer    0
dtype: int64
In [61]:
X = df.drop(columns=["selling_price"])
y = df["selling_price"]
In [62]:
print(X)
      seats  mileage_num  max_power_bhp  car_age   km_per_year  \
0       5.0        23.40          74.00        6  20785.714286   
1       5.0        21.14         103.52        6  17142.857143   
2       5.0        17.70          78.00       14   9333.333333   
3       5.0        23.00          90.00       10  11545.454545   
4       5.0        16.10          88.20       13   8571.428571   
...     ...          ...            ...      ...           ...   
8123    5.0        18.50          82.85        7  13750.000000   
8124    5.0        16.80         110.00       13   8500.000000   
8125    5.0        19.30          73.90       11  10000.000000   
8126    5.0        23.57          70.00        7   3125.000000   
8127    5.0        23.57          70.00        7   3125.000000   

      power_to_engine  owner_num  brand_Ashok  brand_Audi  brand_BMW  ...  \
0            0.059247          1        False       False      False  ...   
1            0.069059          2        False       False      False  ...   
2            0.052069          3        False       False      False  ...   
3            0.064424          1        False       False      False  ...   
4            0.067898          1        False       False      False  ...   
...               ...        ...          ...         ...        ...  ...   
8123         0.069157          1        False       False      False  ...   
8124         0.073628          4        False       False      False  ...   
8125         0.059167          1        False       False      False  ...   
8126         0.050107          1        False       False      False  ...   
8127         0.050107          1        False       False      False  ...   

      brand_Tata  brand_Toyota  brand_Volkswagen  brand_Volvo  is_automatic  \
0          False         False             False        False             0   
1          False         False             False        False             0   
2          False         False             False        False             0   
3          False         False             False        False             0   
4          False         False             False        False             0   
...          ...           ...               ...          ...           ...   
8123       False         False             False        False             0   
8124       False         False             False        False             0   
8125       False         False             False        False             0   
8126        True         False             False        False             0   
8127        True         False             False        False             0   

      fuel_Diesel  fuel_LPG  fuel_Petrol  seller_type_Individual  \
0            True     False        False                    True   
1            True     False        False                    True   
2           False     False         True                    True   
3            True     False        False                    True   
4           False     False         True                    True   
...           ...       ...          ...                     ...   
8123        False     False         True                    True   
8124         True     False        False                    True   
8125         True     False        False                    True   
8126         True     False        False                    True   
8127         True     False        False                    True   

      seller_type_Trustmark Dealer  
0                            False  
1                            False  
2                            False  
3                            False  
4                            False  
...                            ...  
8123                         False  
8124                         False  
8125                         False  
8126                         False  
8127                         False  

[7906 rows x 43 columns]
In [63]:
print(y)
0       13.017005
1       12.821261
2       11.970357
3       12.323860
4       11.775297
          ...    
8123    12.676079
8124    11.813037
8125    12.853179
8126    12.577640
8127    12.577640
Name: selling_price, Length: 7906, dtype: float64
In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
In [65]:
Lr = LinearRegression()
In [66]:
Lr.fit(X_train, y_train)
Out[66]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [67]:
train_data = Lr.predict(X_train)
In [68]:
# Evaluate the model on the training data
mse_train = mean_squared_error(y_train, train_data)
r2_train = r2_score(y_train, train_data)
print(f"Training MSE: {mse_train:.4f}")
print(f"Training R^2: {r2_train:.4f}")
Training MSE: 0.0598
Training R^2: 0.9127
In [72]:
plt.scatter(y_train, train_data, alpha=0.5)
plt.xlabel("Actual Selling Price (log-transformed)")
plt.ylabel("Predicted Selling Price (log-transformed)")
plt.title("Actual vs Predicted Selling Price (Training Data)")
plt.show()
No description has been provided for this image
In [70]:
test_data = Lr.predict(X_test)
In [71]:
mse_test = mean_squared_error(y_test, test_data)
r2_test = r2_score(y_test, test_data)
print(f"Training MSE: {mse_test:.4f}")
print(f"Training R^2: {r2_test:.4f}")
Training MSE: 0.0647
Training R^2: 0.9052

Lasso Regresiion¶

In [73]:
# Lasso regression with alpha=0.1
Lasso_model = Lasso(alpha=0.1)
Lasso_model.fit(X_train, y_train)
Out[73]:
Lasso(alpha=0.1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Lasso(alpha=0.1)
In [75]:
train_data = Lasso_model.predict(X_train)
In [76]:
# Evaluate the model on the training data
mse_train = mean_squared_error(y_train, train_data)
r2_train = r2_score(y_train, train_data)
print(f"Training MSE: {mse_train:.4f}")
print(f"Training R^2: {r2_train:.4f}")
Training MSE: 0.1112
Training R^2: 0.8377
In [77]:
test_data = Lasso_model.predict(X_test)
mse_test = mean_squared_error(y_test, test_data)
r2_test = r2_score(y_test, test_data)
print(f"Training MSE: {mse_test:.4f}")
print(f"Training R^2: {r2_test:.4f}")
Training MSE: 0.1098
Training R^2: 0.8392

Ridge Regression¶

In [78]:
Ridge_model = Ridge(alpha=1.0)
Ridge_model.fit(X_train, y_train)
train_data = Ridge_model.predict(X_train)
In [79]:
mse_train = mean_squared_error(y_train, train_data)
r2_train = r2_score(y_train, train_data)
print(f"Training MSE: {mse_train:.4f}")
print(f"Training R^2: {r2_train:.4f}")
Training MSE: 0.0600
Training R^2: 0.9125
In [80]:
test_data = Ridge_model.predict(X_test)
In [81]:
mse_test = mean_squared_error(y_test, test_data)
r2_test = r2_score(y_test, test_data)
print(f"Training MSE: {mse_test:.4f}")
print(f"Training R^2: {r2_test:.4f}")
Training MSE: 0.0644
Training R^2: 0.9056
In [ ]:
 
In [ ]: